Overview

This document contains a reproducible query that searches for the terms “disinformation” and “misinformation” in the titles and abstracts of peer-reviewed articles published between 2000 and 2025. The data are retrieved from the Semantic Scholar API and cleaned for encoding errors and duplicate results.

Two tables are constructed with the results:

Annual Publication Counts: A summary table displaying the count of articles containing “disinformation” compared to “misinformation” for each year, alongside the total article count.

Top Cited Literature: A list of the most influential papers retrieved, ranked by citation count and including metadata such as authors, year, and Semantic Scholar URL.

Environment Setup

library(httr2)
library(purrr)
library(dplyr)
library(tibble)
library(stringi)
library(DT)
library(stringr)
library(knitr)
library(kableExtra)

Text Processing

# UTF-8 text normalizaton
sanitize_utf8 <- function(x) {
  stringi::stri_enc_toutf8(x, is_unknown_8bit = TRUE)
}

Search Parameters

# API Query Builder
#
# Runs one request against the Semantic Scholar Graph API paper-search
# endpoint and flattens the JSON response into a tibble.
#
# Arguments:
#   query     Search string sent as the `query` parameter.
#   offset    Index of the first result to return; values below 0 become 0.
#   limit     Results requested for this page; values above 100 become 100.
#   year_min, year_max
#             Publication-year bounds, sent as "<year_min>-<year_max>".
#
# Returns a tibble with columns abstract, title, authors, year, citations,
# and link. A failed request or an empty result set yields the same columns
# with zero rows; a failed request additionally raises a warning.
#
# Stops with an error when the SS_API_KEY environment variable is unset.
search_semantic_scholar <- function(query,
                                    offset   = 0,
                                    limit    = 100,
                                    year_min = 2000,
                                    year_max = 2025) {

  # Validate API key
  api_key <- Sys.getenv("SS_API_KEY")
  if (api_key == "") {
    stop("Semantic Scholar API key not found in environment variable SS_API_KEY")
  }

  # Pagination constraints
  limit  <- min(limit, 100)       # Hard cap on results per request
  offset <- max(offset, 0)

  fields <- paste(
    c("title",
      "abstract",
      "authors.name",
      "year",
      "citationCount",
      "url"),
    collapse = ",")

  resp <- tryCatch(
    {
      request("https://api.semanticscholar.org/graph/v1/paper/search") %>%
        req_headers(
          "x-api-key" = api_key,
          "Accept"    = "application/json") %>%
        req_url_query(
          query  = query,
          offset = offset,
          limit  = limit,
          year   = paste0(year_min, "-", year_max),
          fields = fields) %>%
        req_retry(
          max_tries = 3,
          backoff   = function(i) 2 ^ i) %>%    # Exponential back off
        req_perform() %>%
        resp_body_json()
    },
    error = function(e) {
      # Without this warning a failed request is indistinguishable from
      # "no more results", and callers that paginate stop silently.
      # NOTE(review): the endpoint may reject deep offsets; if so, pagination
      # ends with this warning rather than an empty page -- confirm the
      # documented offset ceiling.
      warning(
        "Semantic Scholar request failed (query = \"", query,
        "\", offset = ", offset, "): ", conditionMessage(e),
        call. = FALSE)
      NULL
    })

  # Defensive logic: failed request, missing `data`, or empty `data`
  papers <- if (is.null(resp)) NULL else resp[["data"]]
  if (length(papers) == 0) {
    return(tibble(
      abstract  = character(),
      title     = character(),
      authors   = character(),
      year      = integer(),
      citations = integer(),
      link      = character()))
  }

  # Collapse one paper's author list into "A, B, C". Author entries without
  # a name are skipped, and a paper with no usable names yields NA.
  format_authors <- function(paper) {
    author_names <- map_chr(
      paper[["authors"]], "name", .default = NA_character_)
    author_names <- author_names[!is.na(author_names)]
    if (length(author_names) == 0) {
      return(NA_character_)
    }
    paste(author_names, collapse = ", ")
  }

  # Normal success path
  tibble(
    abstract = sanitize_utf8(
      map_chr(papers, "abstract", .default = NA_character_)),
    title = sanitize_utf8(
      map_chr(papers, "title", .default = NA_character_)),
    authors = sanitize_utf8(map_chr(papers, format_authors)),
    year = map_int(papers, "year", .default = NA_integer_),
    citations = map_int(papers, "citationCount", .default = NA_integer_),
    link = map_chr(papers, "url", .default = NA_character_))
}

Pagination and Data Retrieval

# Rate-Limited Paginated Data Retrieval

# Search terms submitted to the API; each term gets its own paginated fetch
queries <- c("misinformation", "disinformation")

# Safe paginated fetch for one query
#
# Requests consecutive result pages from search_semantic_scholar() until a
# page comes back with zero rows, pausing for a random interval before every
# request.
#
# Arguments:
#   query      Search string passed through to search_semantic_scholar().
#   year_min, year_max
#              Publication-year bounds passed through unchanged.
#   limit      Requested page size; clamped to the range 1..100.
#   sleep_rng  Length-2 numeric: minimum and maximum pause, in seconds,
#              drawn uniformly before each request.
#
# Returns every retrieved page row-bound into one table, with two added
# columns: retrieval_query (the query string) and retrieval_offset (the
# offset of the page the row came from).
fetch_query_pages <- function(query,
                              year_min = 2000,
                              year_max = 2025,
                              limit    = 100,
                              sleep_rng = c(1.5, 3)) {
  # search_semantic_scholar() never requests more than 100 rows per call, so
  # the offset has to advance by the same capped size; stepping by a larger
  # caller-supplied `limit` would skip the rows in between.
  page_size <- max(1, min(limit, 100))

  offset <- 0
  pages  <- list()
  repeat {
    Sys.sleep(runif(1, sleep_rng[1], sleep_rng[2]))

    res <- search_semantic_scholar(
      query    = query,
      offset   = offset,
      limit    = page_size,
      year_min = year_min,
      year_max = year_max)
    # stop when API returns nothing
    if (nrow(res) == 0) break
    pages[[length(pages) + 1]] <- res %>%
      mutate(retrieval_query = query,
             retrieval_offset = offset)
    offset <- offset + page_size
  }
  bind_rows(pages)
}

# Fetch every query's pages for 2000-2025 and stack them into one table
results_raw <- list_rbind(
  map(
    queries,
    function(term) {
      fetch_query_pages(term, year_min = 2000, year_max = 2025)
    }))

Annual Publication Counts

# Output 1: publications per year
#   One row per publication year, newest first, with:
#     - the year
#     - how many articles mention 'disinformation' in title or abstract
#     - how many articles mention 'misinformation' in title or abstract
#     - how many articles there are in total
#   Rows sharing the same title and year are counted once.

article_counts <- results_raw %>%
  distinct(title, year, .keep_all = TRUE) %>%
  # Lower-cased title + abstract; a missing abstract contributes nothing
  mutate(text = str_to_lower(paste(title, coalesce(abstract, "")))) %>%
  group_by(year) %>%
  summarise(
    disinformation_titles = sum(
      str_detect(text, "disinformation"), na.rm = TRUE),
    misinformation_titles = sum(
      str_detect(text, "misinformation"), na.rm = TRUE),
    total_titles = n(),
    .groups = "drop") %>%
  arrange(desc(year))

# Plain data frame with the columns in display order
article_counts_dt <- as.data.frame(
  select(
    article_counts,
    year,
    disinformation_titles,
    misinformation_titles,
    total_titles))

# Interactive table: newest year first, centred cells, export buttons
DT::datatable(
  article_counts_dt,
  colnames = c(
    "Year",
    "Articles mentioning 'Disinformation'",
    "Articles mentioning 'Misinformation'",
    "Total Articles"),
  rownames = FALSE,
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("copy", "csv", "excel"),
    pageLength = 15,
    autoWidth = TRUE,
    order = list(list(0, "desc")),
    columnDefs = list(
      list(className = "dt-center", targets = "_all"))))

Top Cited Literature

# Output 2:
#   One row per paper (rows sharing title and year are kept once), ordered
#   by citation count, highest first:
#     - Title
#     - Authors
#     - Year
#     - Citation Count
#     - Semantic Scholar URL (rendered as a clickable link)
results_display <- results_raw %>%
  distinct(title, year, .keep_all = TRUE) %>%
  arrange(desc(citations)) %>%
  transmute(
    title,
    authors,
    year,
    citations,
    # A missing URL stays NA (blank cell) instead of becoming href="NA".
    # The URL is attribute-escaped because this column is rendered as raw
    # HTML; "&" is replaced first so later entities are not double-escaped.
    link = if_else(
      is.na(link),
      NA_character_,
      sprintf(
        '<a href="%s" target="_blank" rel="noopener noreferrer">link</a>',
        str_replace_all(
          link,
          c("&" = "&amp;", "\"" = "&quot;", "<" = "&lt;", ">" = "&gt;")))))

DT::datatable(
  results_display,
  # Escape the API-supplied columns (title, authors, year, citations) and
  # leave only column 5, the anchor built above, as raw HTML.
  escape = 1:4,
  rownames = FALSE,
  extensions = "Buttons",
  options = list(
    pageLength = 5,
    autoWidth = FALSE,
    dom = "Bfrtip",
    buttons = c("copy", "csv", "excel"),
    order = list(list(3, "desc")),   # citations
    columnDefs = list(
      list(width = "450px", targets = 0),      # title
      list(width = "200px", targets = 1),      # authors
      list(width = "80px",  targets = 2),      # year
      list(width = "90px",  targets = 3),      # citations
      list(width = "70px",  targets = 4))),    # link
  colnames = c(
    "Title",
    "Authors",
    "Year",
    "Citations",
    "Link"))